Code
import pandas as pd
from IPython.display import display

# ---------------------------------------------------------------------------
# Inputs: the employment-by-gender workbook and the Lightcast job-postings CSV.
# ---------------------------------------------------------------------------
xlsx_path = "/home/ubuntu/ad688-employability-sp25A1-group8-1/data/employment_gender.xlsx"
job_posting_path = "/home/ubuntu/github-classroom/met-ad-688/assignment-03-zimozeng12/lightcast_job_postings.csv"

# ---------------------------------------------------------------------------
# Gender table: share of women in each occupation group.
# ---------------------------------------------------------------------------
df_gender = pd.read_excel(xlsx_path, sheet_name="Sheet1", engine="openpyxl")
df_gender["female_ratio"] = df_gender["women"].div(df_gender["total"])

# ---------------------------------------------------------------------------
# Job postings: coerce the 2-digit NAICS code to a number so it can be looked
# up in the mapping below; values that cannot be parsed become NaN.
# ---------------------------------------------------------------------------
df_jobs = pd.read_csv(job_posting_path, low_memory=False)
df_jobs["NAICS2"] = pd.to_numeric(df_jobs["NAICS2"], errors="coerce")

# Hand-written crosswalk from 2-digit NAICS sector code to the occupation label
# used as the join key against df_gender["occupation"].
# NOTE(review): several codes share one label (e.g. 42, 44 and 53), so the
# sector -> occupation relation is many-to-one.
naics_to_occupation = {
    11: "Farming, fishing, and forestry occupations",
    21: "Natural resources, construction, and maintenance occupations",
    22: "Production, transportation, and material moving occupations",
    23: "Construction and extraction occupations",
    31: "Production, transportation, and material moving occupations",
    42: "Sales and office occupations",
    44: "Sales and office occupations",
    48: "Production, transportation, and material moving occupations",
    51: "Computer and mathematical occupations",
    52: "Business and financial operations occupations",
    53: "Sales and office occupations",
    54: "Professional and related occupations",
    55: "Management occupations",
    56: "Office and administrative support occupations",
    61: "Education, training, and library occupations",
    62: "Healthcare practitioners and technical occupations",
    71: "Arts, design, entertainment, sports, and media occupations",
    72: "Food preparation and serving related occupations",
    81: "Personal care and service occupations",
    92: "Public Administration",
    99: "Unclassified",
}

# Codes that are missing from the crosswalk end up as NaN in "Occupation".
df_jobs["Occupation"] = df_jobs["NAICS2"].map(naics_to_occupation)

# ---------------------------------------------------------------------------
# Attach the female ratio to every posting. A left join keeps postings whose
# occupation label has no counterpart in the gender table (ratio stays NaN).
# ---------------------------------------------------------------------------
gender_lookup = df_gender[["occupation", "female_ratio"]]
df_merged = pd.merge(
    df_jobs,
    gender_lookup,
    how="left",
    left_on="Occupation",
    right_on="occupation",
)

# ---------------------------------------------------------------------------
# Summary: one row per occupation, ordered from highest to lowest female ratio.
# ---------------------------------------------------------------------------
summary_columns = ["NAICS2_NAME", "Occupation", "female_ratio"]

# Discard postings with a missing sector name, occupation, or ratio.
matched_rows = df_merged[summary_columns].dropna()

# NOTE(review): rows sharing an occupation presumably tie on female_ratio, and
# the default sort algorithm is not guaranteed to be stable, so the NAICS2_NAME
# that survives the de-duplication below is whichever tied row lands first.
ranked_rows = matched_rows.sort_values(by="female_ratio", ascending=False)

df_cleaned = ranked_rows.drop_duplicates(subset="Occupation", keep="first").reset_index(drop=True)
display(df_cleaned)

| | NAICS2_NAME | Occupation | female_ratio |
|---|---|---|---|
| 0 | Health Care and Social Assistance | Healthcare practitioners and technical occupat... | 0.758788 |
| 1 | Other Services (except Public Administration) | Personal care and service occupations | 0.748341 |
| 2 | Educational Services | Education, training, and library occupations | 0.727640 |
| 3 | Administrative and Support and Waste Managemen... | Office and administrative support occupations | 0.712298 |
| 4 | Wholesale Trade | Sales and office occupations | 0.605601 |
| 5 | Professional, Scientific, and Technical Services | Professional and related occupations | 0.565025 |
| 6 | Finance and Insurance | Business and financial operations occupations | 0.539946 |
| 7 | Accommodation and Food Services | Food preparation and serving related occupations | 0.539016 |
| 8 | Arts, Entertainment, and Recreation | Arts, design, entertainment, sports, and media... | 0.480161 |
| 9 | Management of Companies and Enterprises | Management occupations | 0.419353 |
| 10 | Agriculture, Forestry, Fishing and Hunting | Farming, fishing, and forestry occupations | 0.270517 |
| 11 | Information | Computer and mathematical occupations | 0.268840 |
| 12 | Manufacturing | Production, transportation, and material movin... | 0.249274 |
| 13 | Mining, Quarrying, and Oil and Gas Extraction | Natural resources, construction, and maintenan... | 0.058216 |
| 14 | Construction | Construction and extraction occupations | 0.043041 |
